# Fix the random seed so the k-means runs further down are reproducible.
set.seed(1)
# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, letting the script fail later at the first call that
# needs the package.
library(cluster)
library(clValid)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
library(xlsx)
library(ggplot2)

RGB data

Skip Data Preprocessing

Skip to Measuring Clustering Effectiveness

Skip to Boxplots of Cluster (by Feature)

Data Preprocessing

Reading in and storing dataset from file

# Load the slide statistics from the CSV export
x <- read.csv(file = "ProjectStatistics_UNCSlideStainChar.csv")
# Keep an unmodified copy; the cluster assignments are bound to it before export
Final <- x
# Show the available column names
colnames(x)
##  [1] "scene_name"                         
##  [2] "scene_id"                           
##  [3] "scene_ver"                          
##  [4] "th_BG"                              
##  [5] "Layer.mean.of.Layer.1..unclassified"
##  [6] "Layer.mean.of.Layer.2..unclassified"
##  [7] "Layer.mean.of.Layer.3..unclassified"
##  [8] "th_RL1"                             
##  [9] "th_RL2"                             
## [10] "th_RL3"

Removing the undesired identifier variables: scene_name, scene_id, and scene_ver

# Removing the identifier columns (scene_name, scene_id, scene_ver).
# They are selected by name so the result does not depend on their position
# in the file; drop = FALSE keeps x a data frame in every case.
x <- x[
  ,
  setdiff(colnames(x), c("scene_name", "scene_id", "scene_ver")),
  drop = FALSE
]

To perform cluster analysis, all columns are normalized. The analysis is performed on both the original and the scaled data.

# Standardise every column (centre on its mean, divide by its standard deviation)
x.s <- scale(x, center = TRUE, scale = TRUE)

Back to top

Data Analysis

Clustering and Recording Clusters

Scaled

# Every method below is evaluated for 2 to 12 clusters; each result is a
# matrix with one row per observation and one column per cluster count.

# hierarchical
# cutree() accepts a vector of cluster counts and returns one column per
# count, so the assignments do not have to be accumulated in a loop.
hier.s <- hclust(dist(x.s))
HIER.S <- cutree(hier.s, k = 2:12)

# kmeans
# One run per cluster count, executed in increasing order so the random
# number stream is consumed exactly as in a sequential loop.
KMEANS.S <- do.call(
  cbind,
  lapply(2:12, function(k) kmeans(x.s, centers = k)$cluster)
)

# diana
dia.s <- diana(x.s)
DIA.S <- cutree(dia.s, k = 2:12)

# pam
# The assignments are never stored in a variable called `pam`, so the
# function name is not shadowed and no rm() clean-up is needed afterwards.
PAM.S <- do.call(
  cbind,
  lapply(2:12, function(k) pam(x.s, k, cluster.only = TRUE))
)

Unscaled

# Every method below is evaluated for 2 to 12 clusters; each result is a
# matrix with one row per observation and one column per cluster count.

# hierarchical
# cutree() accepts a vector of cluster counts and returns one column per
# count, so the assignments do not have to be accumulated in a loop.
hier.u <- hclust(dist(x))
HIER.U <- cutree(hier.u, k = 2:12)

# kmeans
# One run per cluster count, executed in increasing order so the random
# number stream is consumed exactly as in a sequential loop.
KMEANS.U <- do.call(
  cbind,
  lapply(2:12, function(k) kmeans(x, centers = k)$cluster)
)

# diana
dia.u <- diana(x)
DIA.U <- cutree(dia.u, k = 2:12)

# pam
# The assignments are never stored in a variable called `pam`, so the
# function name is not shadowed and no rm() clean-up is needed afterwards.
PAM.U <- do.call(
  cbind,
  lapply(2:12, function(k) pam(x, k, cluster.only = TRUE))
)

Back to top

Measuring Clustering Effectiveness

Several different measures are used. For every metric except Dunn and Silhouette, lower values indicate a better (more stable) clustering; for Dunn and Silhouette, higher values are better. Unfortunately, all graphs suggest either two or twelve clusters, which is not consistent with the existing research.

Scaled

# Euclidean
# Internal and stability validation of all four methods for 2 to 12 clusters
Euc.s <- clValid(
  x.s,
  nClust = 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "euclidean"
)
## Warning in clValid(x.s, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Euc.s)

# Correlation
# Same validation, using the correlation-based distance instead
Cor.s <- clValid(
  x.s,
  nClust = 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "correlation"
)
## Warning in clValid(x.s, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Cor.s)

Unscaled

# Euclidean
# Internal and stability validation of all four methods for 2 to 12 clusters
Euc <- clValid(
  x,
  nClust = 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "euclidean"
)
## Warning in clValid(x, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Euc)

# Correlation
# Same validation, using the correlation-based distance instead
Cor <- clValid(
  x,
  nClust = 2:12,
  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
  validation = c("internal", "stability"),
  maxitems = 800,
  metric = "correlation"
)
## Warning in clValid(x, 2:12, clMethods = c("hierarchical", "kmeans",
## "diana", : rownames for data not specified, using 1:nrow(data)
plot(Cor)

Attaching cluster vectors to dataset copies

# Label the eleven columns of every assignment matrix with the cluster count
# they hold (2 through 12, spelled out).
colnames(HIER.S) <-
  colnames(HIER.U) <-
  colnames(KMEANS.S) <-
  colnames(KMEANS.U) <-
  colnames(DIA.S) <-
  colnames(DIA.U) <-
  colnames(PAM.S) <-
  colnames(PAM.U) <-
  c(
    "two", "three", "four", "five", "six", "seven",
    "eight", "nine", "ten", "eleven", "twelve"
  )
# Scaled: original dataset followed by the assignments of each method
Final_Hier.s   <- cbind(Final, HIER.S)
Final_Kmeans.s <- cbind(Final, KMEANS.S)
Final_Dia.s    <- cbind(Final, DIA.S)
Final_Pam.s    <- cbind(Final, PAM.S)
# Unscaled: original dataset followed by the assignments of each method
Final_Hier.u   <- cbind(Final, HIER.U)
Final_Kmeans.u <- cbind(Final, KMEANS.U)
Final_Dia.u    <- cbind(Final, DIA.U)
Final_Pam.u    <- cbind(Final, PAM.U)

Exporting dataset to .xlsx files

# Each export is wrapped in try() so that a failed write does not stop the
# remaining exports.
# NOTE(review): row.names is left at its default here; the column positions
# used by Box_by_Cluster presumably rely on the extra leading column this
# produces when the files are read back -- confirm before changing it.

# Scaled
try(write.xlsx(
  x = Final_Hier.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Scaled.xlsx"
))
try(write.xlsx(
  x = Final_Kmeans.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Scaled.xlsx"
))
try(write.xlsx(
  x = Final_Dia.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Scaled.xlsx"
))
try(write.xlsx(
  x = Final_Pam.s,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Scaled.xlsx"
))
# Unscaled
try(write.xlsx(
  x = Final_Hier.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Unscaled.xlsx"
))
try(write.xlsx(
  x = Final_Kmeans.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Unscaled.xlsx"
))
try(write.xlsx(
  x = Final_Dia.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Unscaled.xlsx"
))
try(write.xlsx(
  x = Final_Pam.u,
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Unscaled.xlsx"
))

Analysis of Clustering

Reading in from file

# Re-import the first sheet of every exported workbook.
# Scaled
Hier_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Scaled.xlsx",
  sheetIndex = 1
)
Kmeans_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Scaled.xlsx",
  sheetIndex = 1
)
Diana_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Scaled.xlsx",
  sheetIndex = 1
)
Pam_s <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Scaled.xlsx",
  sheetIndex = 1
)
# Unscaled
Hier_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Hier_Unscaled.xlsx",
  sheetIndex = 1
)
Kmeans_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Kmeans_Unscaled.xlsx",
  sheetIndex = 1
)
Diana_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Dia_Unscaled.xlsx",
  sheetIndex = 1
)
Pam_u <- read.xlsx(
  file = "ProjectStatistics_UNCSlideStainChar_Cluster_Pam_Unscaled.xlsx",
  sheetIndex = 1
)

Summary by Cluster

Hierarchical Scaled

K-Means Scaled

Diana Scaled

Pam Scaled

Hierarchical Unscaled

K-Means Unscaled

Diana Unscaled

Pam Unscaled

Attaching Summary as Sheets to Corresponding Files

Hierarchical Scaled

K-Means Scaled

Diana Scaled

Pam Scaled

Hierarchical Unscaled

K-Means Unscaled

Diana Unscaled

Pam Unscaled

Back to top

Creating Boxplots of Clusters (by Feature)

# Prints one boxplot per feature column of `x`, grouped by a single cluster
# assignment column.
#
# Args:
#   x:            data frame holding the feature columns and the cluster
#                 assignment columns.
#   Cluster:      name (string) of the cluster assignment column shown on the
#                 x-axis.
#   Cl.type:      optional label inserted into every plot title.
#   feature_cols: positions of the columns plotted on the y-axis.
#   cluster_cols: positions of the cluster assignment columns that are
#                 converted to factors before plotting.
#
# The defaults (5:11 and 12:22) are the positions that were previously
# hard-coded; presumably they match the workbooks read back with read.xlsx
# (leading row-name column, ten original columns, eleven cluster columns) --
# confirm against the exported files.
#
# Returns NULL invisibly; the function is called for the plots it prints.
Box_by_Cluster <- function(x, Cluster, Cl.type = NULL,
                           feature_cols = 5:11, cluster_cols = 12:22){
  # Fail early with a clear message when a requested position does not exist
  stopifnot(
    all(c(feature_cols, cluster_cols) %in% seq_len(ncol(x)))
  )

  # factorizing all cluster assignments so every cluster is drawn as its own box
  x[cluster_cols] <- lapply(x[cluster_cols], as.factor)

  # Printing Boxplots
  for(V in colnames(x)[feature_cols]){
    g <- ggplot(x, aes_string(x = Cluster, y = V))
    print(g + geom_boxplot() +
          labs(title = paste("Box Plot of",Cl.type,"Clustering"),
               x = "Cluster"))
  }
  invisible(NULL)
}

# Calls Box_by_Cluster() once for every cluster-count column, "two" through
# "twelve", printing a header line before each set of plots.
#   x:        data frame handed on to Box_by_Cluster().
#   Cl.type2: optional label forwarded as Box_by_Cluster()'s Cl.type argument.
Box_all_clusters <- function(x, Cl.type2 = NULL){
  count_names <- c("two", "three", "four", "five", "six", "seven",
                   "eight", "nine", "ten", "eleven", "twelve")
  for(count_name in count_names){
    header <- paste("Summary for", count_name, "clusters.")
    print(header)
    Box_by_Cluster(x, Cluster = count_name, Cl.type = Cl.type2)
  }
}

Hierarchical Scaled

# Boxplots for every cluster count of the hierarchical clustering (scaled data)
Box_all_clusters(x = Hier_s, Cl.type2 = "Hierarchical Scaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

K-Means Scaled

# Boxplots for every cluster count of the k-means clustering (scaled data)
Box_all_clusters(x = Kmeans_s, Cl.type2 = "K-Means Scaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

Diana Scaled

# Boxplots for every cluster count of the diana clustering (scaled data)
Box_all_clusters(x = Diana_s, Cl.type2 = "Diana Scaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

Pam Scaled

# Boxplots for every cluster count of the pam clustering (scaled data)
Box_all_clusters(x = Pam_s, Cl.type2 = "Pam Scaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

Hierarchical Unscaled

# Boxplots for every cluster count of the hierarchical clustering (unscaled data)
Box_all_clusters(x = Hier_u, Cl.type2 = "Hierarchical Unscaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

K-Means Unscaled

# Boxplots for every cluster count of the k-means clustering (unscaled data)
Box_all_clusters(x = Kmeans_u, Cl.type2 = "K-Means Unscaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

Diana Unscaled

# Boxplots for every cluster count of the diana clustering (unscaled data)
Box_all_clusters(x = Diana_u, Cl.type2 = "Diana Unscaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."

Pam Unscaled

# Boxplots for every cluster count of the pam clustering (unscaled data)
Box_all_clusters(x = Pam_u, Cl.type2 = "Pam Unscaled")
## [1] "Summary for two clusters."

## [1] "Summary for three clusters."

## [1] "Summary for four clusters."

## [1] "Summary for five clusters."

## [1] "Summary for six clusters."

## [1] "Summary for seven clusters."

## [1] "Summary for eight clusters."

## [1] "Summary for nine clusters."

## [1] "Summary for ten clusters."

## [1] "Summary for eleven clusters."

## [1] "Summary for twelve clusters."